Let's scrape some nuclear reactors

Our goal: Scrape the table of U.S. nuclear power reactors from the NRC website into a CSV file.

Import the libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import csv

Fetch and parse the HTML

In [5]:
# define the url
URL = 'https://www.nrc.gov/reactors/operating/list-power-reactor-units.html'

# get the page -- the timeout (in seconds) keeps this cell from hanging
# forever if the server never responds
nrc_page = requests.get(URL, timeout=30)

# stop here with an HTTPError if the server answered with a 4xx/5xx status,
# rather than going on to parse an error page as if it were the reactor list
nrc_page.raise_for_status()

# specify the encoding
nrc_page.encoding = 'UTF-8'

# turn it into soup
soup = BeautifulSoup(nrc_page.text, 'html.parser')

Find the table

In [8]:
# grab the first <table> element in the parsed page
reactor_table = soup.find('table')

# .find() returns None when nothing matches; fail here with a clear message
# instead of an AttributeError in a later cell
if reactor_table is None:
    raise ValueError('no <table> element found in the page')

Loop over the rows and write to CSV

In [27]:
with open('reactors.csv', 'w') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=['name', 'link', 'docket', 'reactor_type',
                                                 'license', 'location', 'owner', 'region'])


    for row in reactor_table.find_all('tr')[1:]:
        # each <tr> has some <td> cells inside it; we'll move these into variables,
        # do some string manipulations and write to the CSV
        cells = row.find_all('td')

        # reactor name, detail page link and docket number are all part of the first cell
        # the .contents attribute (it's a list, not a method) holds a tag's children -->
        # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#contents-and-children
        name = cells[0].contents[0].string
        link = 'https://www.nrc.gov' + cells[0].contents[0]['href']
        docket = cells[0].contents[2].strip()
        # license number is in the second cell
        license = cells[1].string.strip()
        # reactor type is in the third cell
        reactor_type = cells[2].string.strip()

        # location is in the fourth cell
        location = cells[3].string.strip()
        # some of the locations have multiple internal spaces -- here's a trick for dealing with that
        # https://stackoverflow.com/a/1546251
        location = ' '.join(location.split())
        # owner is in the fifth cell
        owner = cells[4].contents[0].strip()
        # region is in the sixth cell
        region = cells[5].string.strip()

            'name': name,
            'link': link,
            'docket': docket,
            'reactor_type': reactor_type,
            'license': license,
            'location': location,
            'owner': owner,
            'region': region